import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import histogram
import plotly.io as pio
import plotly.express as px
import plotly.figure_factory as ff
from langdetect import detect
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier,VotingClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from category_encoders.binary import BinaryEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import r2_score
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
sns.set(rc={'figure.figsize': [10, 10]}, font_scale=1.3)
'''
This dataset comprises more than 3000 hotel reviews collected from various countries.
this dataset collected on the 22 September 2023 and its contains Reviews from 2012 to 2023.
I am sharing a glimpse of a work I completed scraping hundreds of hotels' reviews.
The data was obtained through web scraping techniques using Selenium and BeautifulSoup.
'''
df = pd.read_csv("F:/Local Disk (D)/AI-python-EPSLION/final project 2 nov 2023/my datasett/my dataset/DataSet/global_hotel_reviews.csv")
df.head(5)
| Rating | Date | Description | Hotel_name | City | Country | |
|---|---|---|---|---|---|---|
| 0 | NaN | 18-09-2023 | Très bon hôtel comme attendu. On s’occupe de v... | barriere-le-majestic | Cannes | France |
| 1 | 10.0 | 5/9/2023 | NaN | barriere-le-majestic | Cannes | France |
| 2 | 8.0 | 31-10-2022 | personnel à l'écoute et agréable, mais les cha... | barriere-le-majestic | Cannes | France |
| 3 | 10.0 | 3/9/2022 | Accueil chaleureux, bon service et bonne nourr... | barriere-le-majestic | Cannes | France |
| 4 | 10.0 | 7/12/2021 | Excellent Hotel bien situé, excellent service ... | barriere-le-majestic | Cannes | France |
"""
# using df.info() to check how many values is null so here Description has 3810 - 3799
= 11 missing values
Country = 3810 - 3796 = 14 missing value
Rating = 3810 - 3793 = 17 missing value
"""
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3810 entries, 0 to 3809 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Rating 3793 non-null float64 1 Date 3810 non-null object 2 Description 3799 non-null object 3 Hotel_name 3810 non-null object 4 City 3810 non-null object 5 Country 3796 non-null object dtypes: float64(1), object(5) memory usage: 178.7+ KB
"""
use this commnd to get a descriptive statistics summary of a given dataframe.
describe command for categorical values only.
the most common hotel to be visited is Grand Fiesta Americana in Mexico country particularly in Cancun.
"""
df.describe(include = 'all')
| Rating | Date | Description | Hotel_name | City | Country | |
|---|---|---|---|---|---|---|
| count | 3793.000000 | 3810 | 3799 | 3810 | 3810 | 3796 |
| unique | NaN | 1730 | 3694 | 6 | 5 | 3 |
| top | NaN | 22-08-2022 | Excellent | Grand Fiesta Americana | Cancun | Mexico |
| freq | NaN | 11 | 9 | 979 | 1592 | 1586 |
| mean | 8.851041 | NaN | NaN | NaN | NaN | NaN |
| std | 1.893585 | NaN | NaN | NaN | NaN | NaN |
| min | 2.000000 | NaN | NaN | NaN | NaN | NaN |
| 25% | 8.000000 | NaN | NaN | NaN | NaN | NaN |
| 50% | 10.000000 | NaN | NaN | NaN | NaN | NaN |
| 75% | 10.000000 | NaN | NaN | NaN | NaN | NaN |
| max | 10.000000 | NaN | NaN | NaN | NaN | NaN |
#check dublication
df.duplicated().sum()
54
#removing all the duplication before treating with missing data.
df.drop_duplicates(inplace=True)
df.duplicated().sum()
0
'''
Make some of feature engineering to extract (day and month and year and time) seperately from dates.
here i make new variable called (NewDate) in order to amend format date.
then i will drop column "Date".
'''
df['NewDate'] = pd.to_datetime(df.Date,format='mixed')
df.NewDate.value_counts()
NewDate
2022-05-07 10
2022-09-07 10
2023-03-09 10
2023-09-05 10
2023-09-18 9
..
2020-11-08 1
2023-04-25 1
2021-05-01 1
2021-11-01 1
2023-07-27 1
Name: count, Length: 1730, dtype: int64
df['NewDate']
0 2023-09-18
1 2023-05-09
2 2022-10-31
3 2022-03-09
4 2021-07-12
...
3805 2021-04-08
3806 2021-02-08
3807 2021-02-08
3808 2021-02-08
3809 2021-02-08
Name: NewDate, Length: 3756, dtype: datetime64[ns]
df.head(5)
| Rating | Date | Description | Hotel_name | City | Country | NewDate | |
|---|---|---|---|---|---|---|---|
| 0 | NaN | 18-09-2023 | Très bon hôtel comme attendu. On s’occupe de v... | barriere-le-majestic | Cannes | France | 2023-09-18 |
| 1 | 10.0 | 5/9/2023 | NaN | barriere-le-majestic | Cannes | France | 2023-05-09 |
| 2 | 8.0 | 31-10-2022 | personnel à l'écoute et agréable, mais les cha... | barriere-le-majestic | Cannes | France | 2022-10-31 |
| 3 | 10.0 | 3/9/2022 | Accueil chaleureux, bon service et bonne nourr... | barriere-le-majestic | Cannes | France | 2022-03-09 |
| 4 | 10.0 | 7/12/2021 | Excellent Hotel bien situé, excellent service ... | barriere-le-majestic | Cannes | France | 2021-07-12 |
#then i will drop column "Date".
df.drop('Date', axis=1, inplace=True)
df.head(10)
| Rating | Description | Hotel_name | City | Country | NewDate | |
|---|---|---|---|---|---|---|
| 0 | NaN | Très bon hôtel comme attendu. On s’occupe de v... | barriere-le-majestic | Cannes | France | 2023-09-18 |
| 1 | 10.0 | NaN | barriere-le-majestic | Cannes | France | 2023-05-09 |
| 2 | 8.0 | personnel à l'écoute et agréable, mais les cha... | barriere-le-majestic | Cannes | France | 2022-10-31 |
| 3 | 10.0 | Accueil chaleureux, bon service et bonne nourr... | barriere-le-majestic | Cannes | France | 2022-03-09 |
| 4 | 10.0 | Excellent Hotel bien situé, excellent service ... | barriere-le-majestic | Cannes | France | 2021-07-12 |
| 5 | 10.0 | Un sejour plus que parfait dans un hôtel d'exc... | barriere-le-majestic | Cannes | France | 2021-11-21 |
| 6 | 10.0 | Parfait, une fois sur place, nous avons été su... | barriere-le-majestic | Cannes | NaN | 2021-10-17 |
| 7 | 10.0 | Un automne incroyable on appelle ça un palace ... | barriere-le-majestic | Cannes | France | 2021-03-09 |
| 8 | 10.0 | Beautiful hotel, kind and lovely staff , amazi... | barriere-le-majestic | Cannes | NaN | 2021-06-21 |
| 9 | 10.0 | Personnel très accueillant | barriere-le-majestic | Cannes | France | 2021-04-06 |
df.NewDate.value_counts()
NewDate
2022-05-07 10
2022-09-07 10
2023-03-09 10
2023-09-05 10
2023-09-18 9
..
2020-11-08 1
2023-04-25 1
2021-05-01 1
2021-11-01 1
2023-07-27 1
Name: count, Length: 1730, dtype: int64
# null data here in description and country and rating.
df.isna().sum()
Rating 17 Description 11 Hotel_name 0 City 0 Country 14 NewDate 0 dtype: int64
''' if you want to know the null and missing values for particular feature wrtie the below command.
here there is 11 missing values in Description.
in country has 14 missing values.
'''
df['Description'].isnull().sum()
11
df['Country'].isnull().sum()
14
'''
here to deal with missing values :
fill missng data.
For Categorical Data ==> Fill with Mode (Most frequent).
for numerical data ==> fill with mean or median depend on is there any outliers or not.
by using this command you can know which most country visited more by people is {Mexico}.
'''
df['Country'].mode()
0 Mexico Name: Country, dtype: object
'''
here i make fillna to Country with mode which is most frequent value so data that were missed
is now be replaced with Country "Mexico" that being visited more.
'''
df['Country'].fillna(df['Country'].mode()[0], inplace=True)
'''
to make sure kindly write below command in order to check if still are missing values or not.
so its observed that the country not having any missing values.
'''
df['Country'].isnull().sum()
0
df['Description'].mode()
0 Excellent Name: Description, dtype: object
'''
here i make fillna to Description with mode which is most frequent value so data that were missed
is now be replaced with Description "Excellent" .
'''
df['Description'].fillna(df['Description'].mode()[0], inplace=True)
df['Description'].isnull().sum()
0
'''
Rating ==>numerical data ==> how to deal missing values in this column?
1) confirm if there is outliers by using boxplot first.
2) if there are outliers ; the fillna will made using Median (as median not being affected by outliers).
3) if there are not outliers ; the fillna can be made by both mean and median.
===> Here in box plot ; i will accept outliers who lower than lowerfence as its not affected 3810.
as outliers here is 119 record and its normal vaues so its not affected ; anyway i will fillna with median.
but if the ouliers was affect data so in this case you can drop outliers by using IQR.
'''
'\n Rating ==>numerical data ==> how to deal missing values in this column?\n \n 1) confirm if there is outliers by using boxplot first.\n 2) if there are outliers ; the fillna will made using Median (as median not being affected by outliers).\n 3) if there are not outliers ; the fillna can be made by both mean and median.\n ===> Here in box plot ; i will accept outliers who lower than lowerfence as its not affected 3810.\nas outliers here is 119 record and its normal vaues so its not affected ; anyway i will fillna with median.\n\nbut if the ouliers was affect data so in this case you can drop outliers by using IQR.\n '
fig = px.box(df, y='Rating')
fig.show()
'''
If ouliers was affect data by using IQR using the below commands:
IQR (Inter Quartile Range) Inter Quartile Range approach to finding the outliers is the most commonly used and most trusted approach used in the research field.
IQR = Quartile3 – Quartile1
Q1 = 8
Q3 = 10
lower = 5
upper = 13
# IQR
Q1 = df['Rating'].quantile(0.25)
Q3 = df['Rating'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5*IQR
upper = Q3 + 1.5*IQR
upper_array = np.where(df['Rating']>=upper)[0]
lower_array = np.where(df['Rating']<=lower)[0]
Removing the outliers
df = df.drop(index=upper_array, inplace=True,axis=1)
df.drop(index=lower_array,inplace=True,axis=1)
but here i will accept outliers as they are 119 from 3810 so its accepted so the fillna with mean will be occured.
'''
"\nIf ouliers was affect data by using IQR using the below commands:\n\nIQR (Inter Quartile Range) Inter Quartile Range approach to finding the outliers is the most commonly used and most trusted approach used in the research field.\n\nIQR = Quartile3 – Quartile1\n\nQ1 = 8\nQ3 = 10\n\nlower = 5\nupper = 13\n\n# IQR\nQ1 = df['Rating'].quantile(0.25) \nQ3 = df['Rating'].quantile(0.75)\nIQR = Q3 - Q1\nlower = Q1 - 1.5*IQR\nupper = Q3 + 1.5*IQR\n\nupper_array = np.where(df['Rating']>=upper)[0]\nlower_array = np.where(df['Rating']<=lower)[0]\nRemoving the outliers\n\ndf = df.drop(index=upper_array, inplace=True,axis=1)\ndf.drop(index=lower_array,inplace=True,axis=1)\n\nbut here i will accept outliers as they are 119 from 3810 so its accepted so the fillna with mean will be occured.\n\n"
'''
here median of rating is 10 but the result of df['Rating'].mean() is equal to "8.854774003744318" so if i fillna with mean
this is means that the missing value in Rating will be "8.854774003744318" and there is no Rating review float it be intiger
so i will fillna with median.
'''
df['Rating'].median()
10.0
'''
i can make fillna to Rating with mean or meadian as outliers is not affect.
here i make fillna to Rating with median .
'''
df['Rating'].fillna(df['Rating'].median(), inplace=True)
df.isna().sum()
Rating 0 Description 0 Hotel_name 0 City 0 Country 0 NewDate 0 dtype: int64
'''
first record previoulsy was missing value so now its replaced with median which is 10 in this case
'''
df['Rating']
0 10.0
1 10.0
2 8.0
3 10.0
4 10.0
...
3805 6.0
3806 8.0
3807 8.0
3808 10.0
3809 10.0
Name: Rating, Length: 3756, dtype: float64
df
| Rating | Description | Hotel_name | City | Country | NewDate | |
|---|---|---|---|---|---|---|
| 0 | 10.0 | Très bon hôtel comme attendu. On s’occupe de v... | barriere-le-majestic | Cannes | France | 2023-09-18 |
| 1 | 10.0 | Excellent | barriere-le-majestic | Cannes | France | 2023-05-09 |
| 2 | 8.0 | personnel à l'écoute et agréable, mais les cha... | barriere-le-majestic | Cannes | France | 2022-10-31 |
| 3 | 10.0 | Accueil chaleureux, bon service et bonne nourr... | barriere-le-majestic | Cannes | France | 2022-03-09 |
| 4 | 10.0 | Excellent Hotel bien situé, excellent service ... | barriere-le-majestic | Cannes | France | 2021-07-12 |
| ... | ... | ... | ... | ... | ... | ... |
| 3805 | 6.0 | Localização e praia são os diferenciais. Atend... | InterContinental Presidente | Cancun | Mexico | 2021-04-08 |
| 3806 | 8.0 | Excellent | InterContinental Presidente | Cancun | Mexico | 2021-02-08 |
| 3807 | 8.0 | Sistema de café da manhã e happy hour mal expl... | InterContinental Presidente | Cancun | Mexico | 2021-02-08 |
| 3808 | 10.0 | Playa espectacular | InterContinental Presidente | Cancun | Mexico | 2021-02-08 |
| 3809 | 10.0 | Presidente Intercontinental was a great experi... | InterContinental Presidente | Cancun | Mexico | 2021-02-08 |
3756 rows × 6 columns
'''
from here i observe the less rating is (2) and rating (4).
and the more Rating is (10) and (8)
'''
df.Rating.value_counts()
Rating 10.0 2421 8.0 838 6.0 278 4.0 129 2.0 90 Name: count, dtype: int64
'''
1) first univariate:
here i use univariate plots using histogram.
from here i observe the less rating is (2) and rating (4).
and the more Rating is (10) and (8)
'''
fig = px.histogram(df,x='Rating')
fig.show()
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 3756 entries, 0 to 3809 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Rating 3756 non-null float64 1 Description 3756 non-null object 2 Hotel_name 3756 non-null object 3 City 3756 non-null object 4 Country 3756 non-null object 5 NewDate 3756 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(1), object(4) memory usage: 205.4+ KB
'''
The most attarctive hotel is "Grand Fiesta Americana" that located in [Mexico] particularly in 'Cancun'.
the second hotel attart visitors is "Warwick Geneva" that located in [Switzerland] and particularly in 'Geneva'.
the less attarctive hotels is "Hyatt Regency Palais" that located in [France] particularly in 'NICE' and
another bad hotel in [france] also which is "Fairmont Monte Carlo" in "monaco" city.
'''
df.Hotel_name.value_counts()
Hotel_name Grand Fiesta Americana 979 Warwick Geneva 646 InterContinental Presidente 581 barriere-le-majestic 574 Fairmont Monte Carlo 495 Hyatt Regency Palais 481 Name: count, dtype: int64
'''
2) second univariate:
The most attarctive hotel is "Grand Fiesta Americana" that located in [Mexico] particularly in 'Cancun'.
the second hotel attart visitors is "Warwick Geneva" that located in [Switzerland] and particularly in 'Geneva'.
the less attarctive hotels is "Hyatt Regency Palais" that located in [France] particularly in 'NICE' and
another bad hotel in [france] also which is "Fairmont Monte Carlo" in "monaco" city.
'''
fig = px.histogram(df,x='Hotel_name',color_discrete_sequence = ['green'])
fig.show()
'''
The city that is considered as the most tourist destination is [Cancun] in 'Mexico' followed by
[Geneva] in 'Switzerland'
on the other hand the less cities that considered as tourist destination is [Nice] and [Monaco] in france.
'''
df.City.value_counts()
City Cancun 1560 Geneva 646 Cannes 574 Monaco 495 Nice 481 Name: count, dtype: int64
'''
3) third univariate:
The city that is considered as the most tourist destination is [Cancun] in 'Mexico' followed by
[Geneva] in 'Switzerland'
on the other hand the less cities that considered as tourist destination is [Nice] and [Monaco] in france.
'''
fig = px.histogram(df,x='City',color_discrete_sequence = ['red'])
fig.show()
'''
From the prevoius analysis we already be know that that most tourist destination country is "Mexico"
and the less one is "Switzerland"
'''
df.Country.value_counts()
Country Mexico 1568 France 1547 Switzerland 641 Name: count, dtype: int64
'''
4) forth univariate:
The most tourist destination country is "Mexico"
and the less one is "Switzerland"
'''
fig = px.histogram(df,x='Country',color_discrete_sequence = ['grey'])
fig.show()
'''
from here i need to make features engineers to extract day and month and year in order to analysis data meaning that
i want to know which the more year and day and month and season that being better to visit the mentioned countries.
'''
df['NewDate']
0 2023-09-18
1 2023-05-09
2 2022-10-31
3 2022-03-09
4 2021-07-12
...
3805 2021-04-08
3806 2021-02-08
3807 2021-02-08
3808 2021-02-08
3809 2021-02-08
Name: NewDate, Length: 3756, dtype: datetime64[ns]
df['Day'] = df['NewDate'].apply(lambda x: str(x).split(' ')[0].split('-')[-1])
df.head(5)
| Rating | Description | Hotel_name | City | Country | NewDate | Day | |
|---|---|---|---|---|---|---|---|
| 0 | 10.0 | Très bon hôtel comme attendu. On s’occupe de v... | barriere-le-majestic | Cannes | France | 2023-09-18 | 18 |
| 1 | 10.0 | Excellent | barriere-le-majestic | Cannes | France | 2023-05-09 | 09 |
| 2 | 8.0 | personnel à l'écoute et agréable, mais les cha... | barriere-le-majestic | Cannes | France | 2022-10-31 | 31 |
| 3 | 10.0 | Accueil chaleureux, bon service et bonne nourr... | barriere-le-majestic | Cannes | France | 2022-03-09 | 09 |
| 4 | 10.0 | Excellent Hotel bien situé, excellent service ... | barriere-le-majestic | Cannes | France | 2021-07-12 | 12 |
'''
Extract month ==> feature engineer
'''
df['Month'] = df['NewDate'].apply(lambda x: str(x).split(' ')[0].split('-')[1])
df.head(5)
| Rating | Description | Hotel_name | City | Country | NewDate | Day | Month | |
|---|---|---|---|---|---|---|---|---|
| 0 | 10.0 | Très bon hôtel comme attendu. On s’occupe de v... | barriere-le-majestic | Cannes | France | 2023-09-18 | 18 | 09 |
| 1 | 10.0 | Excellent | barriere-le-majestic | Cannes | France | 2023-05-09 | 09 | 05 |
| 2 | 8.0 | personnel à l'écoute et agréable, mais les cha... | barriere-le-majestic | Cannes | France | 2022-10-31 | 31 | 10 |
| 3 | 10.0 | Accueil chaleureux, bon service et bonne nourr... | barriere-le-majestic | Cannes | France | 2022-03-09 | 09 | 03 |
| 4 | 10.0 | Excellent Hotel bien situé, excellent service ... | barriere-le-majestic | Cannes | France | 2021-07-12 | 12 | 07 |
df['Month']
0 09
1 05
2 10
3 03
4 07
..
3805 04
3806 02
3807 02
3808 02
3809 02
Name: Month, Length: 3756, dtype: object
#here i convert Month to be numeric in order to extrat feature engineer which is season so in function i have to use numeric values.
df["Month"]= pd.to_numeric(df["Month"],errors='coerce')
df.head(5)
| Rating | Description | Hotel_name | City | Country | NewDate | Day | Month | |
|---|---|---|---|---|---|---|---|---|
| 0 | 10.0 | Très bon hôtel comme attendu. On s’occupe de v... | barriere-le-majestic | Cannes | France | 2023-09-18 | 18 | 9 |
| 1 | 10.0 | Excellent | barriere-le-majestic | Cannes | France | 2023-05-09 | 09 | 5 |
| 2 | 8.0 | personnel à l'écoute et agréable, mais les cha... | barriere-le-majestic | Cannes | France | 2022-10-31 | 31 | 10 |
| 3 | 10.0 | Accueil chaleureux, bon service et bonne nourr... | barriere-le-majestic | Cannes | France | 2022-03-09 | 09 | 3 |
| 4 | 10.0 | Excellent Hotel bien situé, excellent service ... | barriere-le-majestic | Cannes | France | 2021-07-12 | 12 | 7 |
def months(x):
if x in [12, 1, 2]:
return 'Winter'
elif x in [3, 4, 5]:
return 'Spring'
elif x in [6, 7, 8]:
return 'Summer'
elif x in [9, 10, 11]:
return 'Autumn'
df['Season'] = df['Month'].apply(months)
df.head(5)
| Rating | Description | Hotel_name | City | Country | NewDate | Day | Month | Season | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 10.0 | Très bon hôtel comme attendu. On s’occupe de v... | barriere-le-majestic | Cannes | France | 2023-09-18 | 18 | 9 | Autumn |
| 1 | 10.0 | Excellent | barriere-le-majestic | Cannes | France | 2023-05-09 | 09 | 5 | Spring |
| 2 | 8.0 | personnel à l'écoute et agréable, mais les cha... | barriere-le-majestic | Cannes | France | 2022-10-31 | 31 | 10 | Autumn |
| 3 | 10.0 | Accueil chaleureux, bon service et bonne nourr... | barriere-le-majestic | Cannes | France | 2022-03-09 | 09 | 3 | Spring |
| 4 | 10.0 | Excellent Hotel bien situé, excellent service ... | barriere-le-majestic | Cannes | France | 2021-07-12 | 12 | 7 | Summer |
'''
Extract year ==> feature engineer
'''
df['year'] = df['NewDate'].apply(lambda x: str(x).split(' ')[0].split('-')[0])
df.head(5)
| Rating | Description | Hotel_name | City | Country | NewDate | Day | Month | Season | year | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10.0 | Très bon hôtel comme attendu. On s’occupe de v... | barriere-le-majestic | Cannes | France | 2023-09-18 | 18 | 9 | Autumn | 2023 |
| 1 | 10.0 | Excellent | barriere-le-majestic | Cannes | France | 2023-05-09 | 09 | 5 | Spring | 2023 |
| 2 | 8.0 | personnel à l'écoute et agréable, mais les cha... | barriere-le-majestic | Cannes | France | 2022-10-31 | 31 | 10 | Autumn | 2022 |
| 3 | 10.0 | Accueil chaleureux, bon service et bonne nourr... | barriere-le-majestic | Cannes | France | 2022-03-09 | 09 | 3 | Spring | 2022 |
| 4 | 10.0 | Excellent Hotel bien situé, excellent service ... | barriere-le-majestic | Cannes | France | 2021-07-12 | 12 | 7 | Summer | 2021 |
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 3756 entries, 0 to 3809 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Rating 3756 non-null float64 1 Description 3756 non-null object 2 Hotel_name 3756 non-null object 3 City 3756 non-null object 4 Country 3756 non-null object 5 NewDate 3756 non-null datetime64[ns] 6 Day 3756 non-null object 7 Month 3756 non-null int64 8 Season 3756 non-null object 9 year 3756 non-null object dtypes: datetime64[ns](1), float64(1), int64(1), object(7) memory usage: 322.8+ KB
#here i convert day and year to be numeric values.
df["Day"]= pd.to_numeric(df["Day"],errors='coerce')
df["year"]= pd.to_numeric(df["year"],errors='coerce')
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 3756 entries, 0 to 3809 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Rating 3756 non-null float64 1 Description 3756 non-null object 2 Hotel_name 3756 non-null object 3 City 3756 non-null object 4 Country 3756 non-null object 5 NewDate 3756 non-null datetime64[ns] 6 Day 3756 non-null int64 7 Month 3756 non-null int64 8 Season 3756 non-null object 9 year 3756 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(3), object(5) memory usage: 322.8+ KB
df.Season.value_counts()
Season Summer 1206 Autumn 935 Spring 889 Winter 726 Name: count, dtype: int64
''''
5) fifth univariate:
here the preffered season is summer followed by autumn.
and the worst season for tourism is winter.
'''
fig = px.histogram(df,x='Season',color_discrete_sequence = ['purple'])
fig.show()
df.head()
| Rating | Description | Hotel_name | City | Country | NewDate | Day | Month | Season | year | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10.0 | Très bon hôtel comme attendu. On s’occupe de v... | barriere-le-majestic | Cannes | France | 2023-09-18 | 18 | 9 | Autumn | 2023 |
| 1 | 10.0 | Excellent | barriere-le-majestic | Cannes | France | 2023-05-09 | 9 | 5 | Spring | 2023 |
| 2 | 8.0 | personnel à l'écoute et agréable, mais les cha... | barriere-le-majestic | Cannes | France | 2022-10-31 | 31 | 10 | Autumn | 2022 |
| 3 | 10.0 | Accueil chaleureux, bon service et bonne nourr... | barriere-le-majestic | Cannes | France | 2022-03-09 | 9 | 3 | Spring | 2022 |
| 4 | 10.0 | Excellent Hotel bien situé, excellent service ... | barriere-le-majestic | Cannes | France | 2021-07-12 | 12 | 7 | Summer | 2021 |
'''
more visitors in the end of first week and [second week] month while less tourist in the start of the first week of month.
'''
df.Day.value_counts()
Day 7 194 9 184 8 175 19 146 6 145 23 137 5 137 18 136 21 136 28 130 24 127 14 126 22 123 26 120 13 119 20 118 25 117 30 117 15 115 4 115 1 112 16 111 27 111 10 109 29 102 12 92 11 88 17 85 31 83 3 74 2 72 Name: count, dtype: int64
df_sort_day = df.sort_values(by='Day')
''''
6) six univariate:
I need to support static exploration about Day which people travel more by using seaborn countplot.
more visitors in the seventh and ninth day of the month.
while the less visitors be in second and third day of the month.
'''
sns.countplot(x='Day',data=df_sort_day)
<Axes: xlabel='Day', ylabel='count'>
df_sort_month = df.sort_values(by='Month')
df.Month.value_counts()
Month 8 452 7 417 9 362 6 337 10 320 4 317 3 296 5 276 11 253 2 253 12 245 1 228 Name: count, dtype: int64
'''
7) seven univariate:
here the most months that attract visitors is [August and July] ; while the less month in terms of visit tourists is [Junuary
and december].
'''
sns.countplot(x='Month',data=df_sort_month)
<Axes: xlabel='Month', ylabel='count'>
df.year.value_counts()
year 2022 995 2023 814 2018 476 2019 475 2021 319 2017 194 2015 171 2020 147 2016 121 2014 23 2013 20 2012 1 Name: count, dtype: int64
df_sort_year = df.sort_values(by='year')
'''
8) eight univariate:
visitors travelled more in 2022 and 2023 while less visitors travel in 2012 and 2013.
'''
fig = px.histogram(df_sort_year,x='year',color_discrete_sequence = ['purple'])
fig.show()
'''
1) first bivariate:
here its observed that Mexico take the maximim rate while switzerland take less rating.
'''
sns.violinplot(x = 'Rating',y = 'Country',data = df)
<Axes: xlabel='Rating', ylabel='Country'>
'''
2) Second bivariate:
another way to visualize between rating and country
here its observed that Mexico take the maximim rate while switzerland take less rating.
'''
px.bar(df,x='Rating',y='Country')
'''
3) third bivariate:
here observed that hotel that take most of the rating is Grand Fiesta Americana" that
located in [Mexico] particularly in 'Cancun' city.
while hotels that took less rating are :
"Hyatt Regency Palais" that located in [France] particularly in 'NICE'.
and "Fairmont Monte Carlo" in "monaco" city in [France].
'''
px.bar(df,x='Rating',y='Hotel_name')
'''
4) four bivariate:
here the more rating in Cancun (mexico) while the less rating in 'Nice' and 'Monaco' in france.
'''
fig = px.bar(df,x='Rating',y='City')
fig.show()
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 3756 entries, 0 to 3809 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Rating 3756 non-null float64 1 Description 3756 non-null object 2 Hotel_name 3756 non-null object 3 City 3756 non-null object 4 Country 3756 non-null object 5 NewDate 3756 non-null datetime64[ns] 6 Day 3756 non-null int64 7 Month 3756 non-null int64 8 Season 3756 non-null object 9 year 3756 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(3), object(5) memory usage: 322.8+ KB
'''
5) five bivariate:
from here its observed that more rating in summer season in [Mexico.]
while less rating in season (Autumn) in [Switzerland].
'''
sns.catplot(data=df, x="Rating", y="Country", col="Season", aspect=.5)
<seaborn.axisgrid.FacetGrid at 0x180939cae10>
'''
6) six bivariate:
here in August the more visitors was in Mexico while less visiors in Switzerland in winter.
'''
sns.countplot(x='Month',hue='Country',data=df)
<Axes: xlabel='Month', ylabel='count'>
'''
7) seven bivariate:
here the more rating in seasons (Summer followed by Autumn) and the less rating in season (winter).
'''
sns.countplot(x='Rating',hue='Season',data=df)
<Axes: xlabel='Rating', ylabel='count'>
'''
8) eight bivariate:
'''
px.density_heatmap(df,x='Rating',y='year')
'''
==> In Univariate i use 8 visualization :
6 histogram and two countplot(seaborn).
==> In Bivariate i use 8 visualization which are:
1- sns.violinplot ==> one
2- sns.catplot ==> one
3- px.bar ==> three
4- countplot ==> two
5- px.density_heatmap (plotly) ==> one
==> use px.imshow to show coffecient correlation between Rating(target)and other features.
==> and to detect outliers i use two boxplot ==> that was accepted as its normal case.
and i use 8 different types of visualization and 16 visualization mixed between Univariate and Bivariate.
Brief of this analysis:
- people preferred the hotel "Grand Fiesta Americana in Mexico country particularly in Cancun in summer season.
- they do not prefer "Hyatt Regency Palais" that located in [France] particularly in 'NICE' and also
"Fairmont Monte Carlo" hotel in "monaco" city in also france.
- the country that prefferd is Mexico then Frace then Switherland.
- People travel more in 2022 & 2023 while they didnt prefer to travel or went to hotels in 2012 &2013 and in my opinion that
with the passage of time , people realize the importance of travel and its impact to change your mood ; makes you more able to
be productve in your worl because after vacaion you feel satisfied and have energy that helps you get the work done.
- here the most months that attract visitors is [August and July] ; while the less month in terms of visit tourists is [Junuary
and december].
'''
'\n==> In Univariate i use 8 visualization :\n6 histogram and two countplot(seaborn).\n\n==> In Bivariate i use 8 visualization which are:\n1- sns.violinplot ==> one\n2- sns.catplot ==> one\n3- px.bar ==> three \n4- countplot ==> two\n5- px.density_heatmap (plotly) ==> one\n\n==> use px.imshow to show coffecient correlation between Rating(target)and other features.\n==> and to detect outliers i use two boxplot ==> that was accepted as its normal case.\n\nand i use 8 different types of visualization and 16 visualization mixed between Univariate and Bivariate.\n\nBrief of this analysis:\n\n- people preferred the hotel "Grand Fiesta Americana in Mexico country particularly in Cancun in summer season.\n\n- they do not prefer "Hyatt Regency Palais" that located in [France] particularly in \'NICE\' and also\n"Fairmont Monte Carlo" hotel in "monaco" city in also france.\n\n\n- the country that prefferd is Mexico then Frace then Switherland.\n\n- People travel more in 2022 & 2023 while they didnt prefer to travel or went to hotels in 2012 &2013 and in my opinion that\n\nwith the passage of time , people realize the importance of travel and its impact to change your mood ; makes you more able to\nbe productve in your worl because after vacaion you feel satisfied and have energy that helps you get the work done.\n\n- here the most months that attract visitors is [August and July] ; while the less month in terms of visit tourists is [Junuary \nand december].\n\n'
df
| Rating | Description | Hotel_name | City | Country | NewDate | Day | Month | Season | year | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10.0 | Très bon hôtel comme attendu. On s’occupe de v... | barriere-le-majestic | Cannes | France | 2023-09-18 | 18 | 9 | Autumn | 2023 |
| 1 | 10.0 | Excellent | barriere-le-majestic | Cannes | France | 2023-05-09 | 9 | 5 | Spring | 2023 |
| 2 | 8.0 | personnel à l'écoute et agréable, mais les cha... | barriere-le-majestic | Cannes | France | 2022-10-31 | 31 | 10 | Autumn | 2022 |
| 3 | 10.0 | Accueil chaleureux, bon service et bonne nourr... | barriere-le-majestic | Cannes | France | 2022-03-09 | 9 | 3 | Spring | 2022 |
| 4 | 10.0 | Excellent Hotel bien situé, excellent service ... | barriere-le-majestic | Cannes | France | 2021-07-12 | 12 | 7 | Summer | 2021 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3805 | 6.0 | Localização e praia são os diferenciais. Atend... | InterContinental Presidente | Cancun | Mexico | 2021-04-08 | 8 | 4 | Spring | 2021 |
| 3806 | 8.0 | Excellent | InterContinental Presidente | Cancun | Mexico | 2021-02-08 | 8 | 2 | Winter | 2021 |
| 3807 | 8.0 | Sistema de café da manhã e happy hour mal expl... | InterContinental Presidente | Cancun | Mexico | 2021-02-08 | 8 | 2 | Winter | 2021 |
| 3808 | 10.0 | Playa espectacular | InterContinental Presidente | Cancun | Mexico | 2021-02-08 | 8 | 2 | Winter | 2021 |
| 3809 | 10.0 | Presidente Intercontinental was a great experi... | InterContinental Presidente | Cancun | Mexico | 2021-02-08 | 8 | 2 | Winter | 2021 |
3756 rows × 10 columns
'''
here i will drop Description , NewDate , Season
'''
df.drop('Description',axis=1,inplace = True)
df.drop('NewDate',axis=1,inplace = True)
df.drop('Season',axis=1,inplace = True)
'''
use binary encoder to convert categorical to numerical data.
'''
encoder = BinaryEncoder()
New_df = encoder.fit_transform(df[['Hotel_name']])
New_df
| Hotel_name_0 | Hotel_name_1 | Hotel_name_2 | |
|---|---|---|---|
| 0 | 0 | 0 | 1 |
| 1 | 0 | 0 | 1 |
| 2 | 0 | 0 | 1 |
| 3 | 0 | 0 | 1 |
| 4 | 0 | 0 | 1 |
| ... | ... | ... | ... |
| 3805 | 1 | 1 | 0 |
| 3806 | 1 | 1 | 0 |
| 3807 | 1 | 1 | 0 |
| 3808 | 1 | 1 | 0 |
| 3809 | 1 | 1 | 0 |
3756 rows × 3 columns
df = pd.concat([df,New_df] , axis = 1 )
df.head()
| Rating | Hotel_name | City | Country | Day | Month | year | Hotel_name_0 | Hotel_name_1 | Hotel_name_2 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10.0 | barriere-le-majestic | Cannes | France | 18 | 9 | 2023 | 0 | 0 | 1 |
| 1 | 10.0 | barriere-le-majestic | Cannes | France | 9 | 5 | 2023 | 0 | 0 | 1 |
| 2 | 8.0 | barriere-le-majestic | Cannes | France | 31 | 10 | 2022 | 0 | 0 | 1 |
| 3 | 10.0 | barriere-le-majestic | Cannes | France | 9 | 3 | 2022 | 0 | 0 | 1 |
| 4 | 10.0 | barriere-le-majestic | Cannes | France | 12 | 7 | 2021 | 0 | 0 | 1 |
df.drop('Hotel_name',axis=1,inplace = True)
df
| Rating | City | Country | Day | Month | year | Hotel_name_0 | Hotel_name_1 | Hotel_name_2 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 10.0 | Cannes | France | 18 | 9 | 2023 | 0 | 0 | 1 |
| 1 | 10.0 | Cannes | France | 9 | 5 | 2023 | 0 | 0 | 1 |
| 2 | 8.0 | Cannes | France | 31 | 10 | 2022 | 0 | 0 | 1 |
| 3 | 10.0 | Cannes | France | 9 | 3 | 2022 | 0 | 0 | 1 |
| 4 | 10.0 | Cannes | France | 12 | 7 | 2021 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3805 | 6.0 | Cancun | Mexico | 8 | 4 | 2021 | 1 | 1 | 0 |
| 3806 | 8.0 | Cancun | Mexico | 8 | 2 | 2021 | 1 | 1 | 0 |
| 3807 | 8.0 | Cancun | Mexico | 8 | 2 | 2021 | 1 | 1 | 0 |
| 3808 | 10.0 | Cancun | Mexico | 8 | 2 | 2021 | 1 | 1 | 0 |
| 3809 | 10.0 | Cancun | Mexico | 8 | 2 | 2021 | 1 | 1 | 0 |
3756 rows × 9 columns
df=pd.get_dummies(df,columns=['City','Country'],drop_first=True)
df.head(1000)
| Rating | Day | Month | year | Hotel_name_0 | Hotel_name_1 | Hotel_name_2 | City_Cannes | City_Geneva | City_Monaco | City_Nice | Country_Mexico | Country_Switzerland | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10.0 | 18 | 9 | 2023 | 0 | 0 | 1 | True | False | False | False | False | False |
| 1 | 10.0 | 9 | 5 | 2023 | 0 | 0 | 1 | True | False | False | False | False | False |
| 2 | 8.0 | 31 | 10 | 2022 | 0 | 0 | 1 | True | False | False | False | False | False |
| 3 | 10.0 | 9 | 3 | 2022 | 0 | 0 | 1 | True | False | False | False | False | False |
| 4 | 10.0 | 12 | 7 | 2021 | 0 | 0 | 1 | True | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1002 | 10.0 | 4 | 10 | 2018 | 0 | 1 | 0 | False | False | True | False | False | False |
| 1003 | 8.0 | 4 | 9 | 2018 | 0 | 1 | 0 | False | False | True | False | False | False |
| 1004 | 8.0 | 4 | 2 | 2018 | 0 | 1 | 0 | False | False | True | False | False | False |
| 1005 | 10.0 | 4 | 2 | 2018 | 0 | 1 | 0 | False | False | True | False | False | False |
| 1006 | 8.0 | 4 | 1 | 2018 | 0 | 1 | 0 | False | False | True | False | False | False |
1000 rows × 13 columns
'''
here make rateing as intiger in order to be intiger number not continuous (not to be 3.5 as a rating)
'''
df["Rating"]= df["Rating"].astype('int32')
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 3756 entries, 0 to 3809 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Rating 3756 non-null int32 1 Day 3756 non-null int64 2 Month 3756 non-null int64 3 year 3756 non-null int64 4 Hotel_name_0 3756 non-null int64 5 Hotel_name_1 3756 non-null int64 6 Hotel_name_2 3756 non-null int64 7 City_Cannes 3756 non-null bool 8 City_Geneva 3756 non-null bool 9 City_Monaco 3756 non-null bool 10 City_Nice 3756 non-null bool 11 Country_Mexico 3756 non-null bool 12 Country_Switzerland 3756 non-null bool dtypes: bool(6), int32(1), int64(6) memory usage: 242.1 KB
df["Rating"].value_counts()
Rating 10 2421 8 838 6 278 4 129 2 90 Name: count, dtype: int64
'''
This technique to display the correlation coefficients for different variables.
this tool to identify and visualize patterns in the given data
'''
corr = df.corr()
corr
| Rating | Day | Month | year | Hotel_name_0 | Hotel_name_1 | Hotel_name_2 | City_Cannes | City_Geneva | City_Monaco | City_Nice | Country_Mexico | Country_Switzerland | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Rating | 1.000000 | -0.037824 | -0.006829 | 0.069821 | 0.001417 | -0.048482 | 0.153446 | 0.025296 | -0.097972 | -0.075092 | 0.046688 | 0.077281 | -0.098913 |
| Day | -0.037824 | 1.000000 | 0.055903 | -0.035756 | 0.011670 | -0.001068 | -0.017698 | -0.027459 | 0.033057 | 0.002970 | 0.009368 | -0.010508 | 0.029743 |
| Month | -0.006829 | 0.055903 | 1.000000 | -0.079366 | -0.052631 | 0.083725 | -0.031830 | -0.014821 | -0.027512 | 0.048011 | 0.044900 | -0.033569 | -0.024199 |
| year | 0.069821 | -0.035756 | -0.079366 | 1.000000 | 0.489032 | 0.047524 | 0.075642 | -0.433572 | -0.247107 | -0.155508 | -0.096209 | 0.676796 | -0.246884 |
| Hotel_name_0 | 0.001417 | 0.011670 | -0.052631 | 0.489032 | 1.000000 | -0.366062 | -0.234024 | -0.506691 | 0.382031 | -0.464798 | -0.457197 | 0.706308 | 0.380245 |
| Hotel_name_1 | -0.048482 | -0.001068 | 0.083725 | 0.047524 | -0.366062 | 1.000000 | -0.392813 | -0.357386 | -0.383502 | 0.463015 | 0.455444 | -0.075610 | -0.381708 |
| Hotel_name_2 | 0.153446 | -0.017698 | -0.031830 | 0.075642 | -0.234024 | -0.392813 | 1.000000 | 0.390793 | -0.495330 | -0.423434 | 0.352621 | 0.143973 | -0.493014 |
| City_Cannes | 0.025296 | -0.027459 | -0.014821 | -0.433572 | -0.506691 | -0.357386 | 0.390793 | 1.000000 | -0.193572 | -0.165475 | -0.162769 | -0.355045 | -0.192666 |
| City_Geneva | -0.097972 | 0.033057 | -0.027512 | -0.247107 | 0.382031 | -0.383502 | -0.495330 | -0.193572 | 1.000000 | -0.177567 | -0.174664 | -0.378667 | 0.995323 |
| City_Monaco | -0.075092 | 0.002970 | 0.048011 | -0.155508 | -0.464798 | 0.463015 | -0.423434 | -0.165475 | -0.177567 | 1.000000 | -0.149312 | -0.329820 | -0.176737 |
| City_Nice | 0.046688 | 0.009368 | 0.044900 | -0.096209 | -0.457197 | 0.455444 | 0.352621 | -0.162769 | -0.174664 | -0.149312 | 1.000000 | -0.324426 | -0.173847 |
| Country_Mexico | 0.077281 | -0.010508 | -0.033569 | 0.676796 | 0.706308 | -0.075610 | 0.143973 | -0.355045 | -0.378667 | -0.329820 | -0.324426 | 1.000000 | -0.384016 |
| Country_Switzerland | -0.098913 | 0.029743 | -0.024199 | -0.246884 | 0.380245 | -0.381708 | -0.493014 | -0.192666 | 0.995323 | -0.176737 | -0.173847 | -0.384016 | 1.000000 |
'''
here i use high correlation filter and from the below its show that there is better correlation between year and Rating than day
and month so i will drop day and month as thier correlation with rating equal negative .
'''
fig=px.imshow(corr,text_auto=True)
fig.update_layout(width=1000,height=800)
df.drop('Day',axis=1,inplace = True)
df.drop('Month',axis=1,inplace = True)
'''
Target here is Rating which is numeric.
'''
x=df.drop('Rating',axis=1)
y = df['Rating']
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.2)
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)
HyperParameter Tunning directly affects model performance.
==> each model has its hyperparameter and each parameter has number of different values so the question now which the best values of this parameter that achieve the best performance and accuracy.
HyperParameter Tunning such as : GridsearchCV and Randomized Search.
1) GridsearchCV: ==> i will use this teqnique (more accuracy here on model SVC as its the better model result. GridSearchCV exhaustively considers all parameter combinations.
GridsearchCV advantage: more accuracy than Randomized Search as in Gridsearch try all the possibilities in value range and runnig models depend on number of possiblilties.
GridsearchCV disadvantage: take alot of time and high cost.
The GridSearchCV instance implements the usual estimator API: when “fitting” it on a dataset all the possible combinations of parameter values are evaluated and the best combination is retained.
2) Randomized Search: here try random values and learn random models.
Randomized Search advantage: cost and time redduction.
Randomized Search disadvantage: less accuracy than GridsearchCV .
'''
HyperParameter Tunning (GridSearch)
1) For SVC
'''
model=SVC()
params = [
{'C':[1, 10], 'kernel':['linear', 'sigmoid', 'poly'],'random_state':range(0,10)},
{'C':[1, 10], 'kernel':['rbf'], 'gamma':[0.5, 0.6, 0.7, 0.1, 0.01, 0.01],'random_state':range(0,10)}
]
grid_search_svc=GridSearchCV(estimator=model,
param_grid=params,
scoring='accuracy',
n_jobs=-1)
grid_search_svc.fit(x_train,y_train)
GridSearchCV(estimator=SVC(), n_jobs=-1,
param_grid=[{'C': [1, 10], 'kernel': ['linear', 'sigmoid', 'poly'],
'random_state': range(0, 10)},
{'C': [1, 10],
'gamma': [0.5, 0.6, 0.7, 0.1, 0.01, 0.01],
'kernel': ['rbf'], 'random_state': range(0, 10)}],
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=SVC(), n_jobs=-1,
param_grid=[{'C': [1, 10], 'kernel': ['linear', 'sigmoid', 'poly'],
'random_state': range(0, 10)},
{'C': [1, 10],
'gamma': [0.5, 0.6, 0.7, 0.1, 0.01, 0.01],
'kernel': ['rbf'], 'random_state': range(0, 10)}],
scoring='accuracy')SVC()
SVC()
'''
best parms here is that when C:1 and gamma eq 0.1 and kernel eq 'rbf ==>default'
'''
grid_search_svc.best_params_
{'C': 1, 'gamma': 0.1, 'kernel': 'rbf', 'random_state': 0}
'''
here SVC best score is 0.6411453133666112
'''
grid_search_svc.best_score_
0.6411453133666112
'''
HyperParameter Tunning (GridSearch)
2) For Logistic Regression
'''
grid={"C":np.logspace(1,3,10), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(x_train,y_train)
GridSearchCV(cv=10, estimator=LogisticRegression(),
param_grid={'C': array([ 10. , 16.68100537, 27.82559402, 46.41588834,
77.42636827, 129.1549665 , 215.443469 , 359.38136638,
599.48425032, 1000. ]),
'penalty': ['l1', 'l2']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=10, estimator=LogisticRegression(),
param_grid={'C': array([ 10. , 16.68100537, 27.82559402, 46.41588834,
77.42636827, 129.1549665 , 215.443469 , 359.38136638,
599.48425032, 1000. ]),
'penalty': ['l1', 'l2']})LogisticRegression()
LogisticRegression()
logreg_cv.best_params_
{'C': 10.0, 'penalty': 'l2'}
logreg_cv.best_score_
0.6401417497231451
'''
HyperParameter Tunning (GridSearch)
3) For KNN
'''
knn_classifer=KNeighborsClassifier()
params = [{'n_neighbors': [3, 5, 7, 9],
'weights': ['uniform', 'distance'],
'algorithm':['ball_tree','kd_tree','brute'],
'metric':['cityblock','cosine','euclidean','l1','l2','haversine','manhattan','nan_euclidean','minkowski'],
'leaf_size': [15, 40]}]
grid_search_knn = GridSearchCV(knn_classifer,
param_grid=params,
scoring='accuracy')
grid_search_knn.fit(x_train, y_train)
GridSearchCV(estimator=KNeighborsClassifier(),
param_grid=[{'algorithm': ['ball_tree', 'kd_tree', 'brute'],
'leaf_size': [15, 40],
'metric': ['cityblock', 'cosine', 'euclidean', 'l1',
'l2', 'haversine', 'manhattan',
'nan_euclidean', 'minkowski'],
'n_neighbors': [3, 5, 7, 9],
'weights': ['uniform', 'distance']}],
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=KNeighborsClassifier(),
param_grid=[{'algorithm': ['ball_tree', 'kd_tree', 'brute'],
'leaf_size': [15, 40],
'metric': ['cityblock', 'cosine', 'euclidean', 'l1',
'l2', 'haversine', 'manhattan',
'nan_euclidean', 'minkowski'],
'n_neighbors': [3, 5, 7, 9],
'weights': ['uniform', 'distance']}],
scoring='accuracy')KNeighborsClassifier()
KNeighborsClassifier()
grid_search_knn.best_params_
{'algorithm': 'brute',
'leaf_size': 15,
'metric': 'cosine',
'n_neighbors': 7,
'weights': 'uniform'}
grid_search_knn.best_score_
0.6221741541874654
'''
after using GridSearchCV over models: SVC , Logistic Regression, KNN.
The result is that SVC has the best score.
'''
'\nafter using GridSearchCV over models: SVC , Logistic Regression, KNN.\nThe result is that SVC has the best score.\n'
from imblearn.over_sampling import RandomOverSampler
sampler =RandomOverSampler()
x,y = sampler.fit_resample(x_train,y_train)
'''
when use classificaion the result being better than using rgression as i explained below in the below comment.
'''
models={
'log_reg':LogisticRegression(),
'KNN':KNeighborsClassifier(n_neighbors=5),
'SVC':SVC(),
'NB':GaussianNB(),
'DT':DecisionTreeClassifier(),
'RF':RandomForestClassifier(n_estimators=25,n_jobs=-1),
'Bagging_classifier':BaggingClassifier(DecisionTreeClassifier(),n_estimators=5,n_jobs=-1),
'voting': VotingClassifier(estimators=[('LR',LogisticRegression()),('NB',GaussianNB()),('DT',DecisionTreeClassifier())])
}
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,recall_score,precision_score
import joblib
for name,model in models.items():
print('--------- ',name,'-------------')
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
print('accuracy_training: ',accuracy_score(y_train,model.predict(x_train)))
print('accuracy_testing: ',accuracy_score(y_pred,y_test))
print('confusion matrix: ',confusion_matrix(y_pred,y_test))
print('recall score: ',recall_score(y_pred,y_test,average='micro'))
print('precision score: ',precision_score(y_pred,y_test,average='micro'))
joblib.dump(model,name+'_model.h5')
print('-'*30)
'''
the better result from classification is SVC.
'''
--------- log_reg ------------- accuracy_training: 0.6431424766977364 accuracy_testing: 0.6555851063829787 confusion matrix: [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 1 7 9] [ 19 25 42 163 486]] recall score: 0.6555851063829787 precision score: 0.6555851063829787 ------------------------------ --------- KNN ------------- accuracy_training: 0.5282956058588548 accuracy_testing: 0.4867021276595745 confusion matrix: [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 1 1] [ 9 5 17 55 183] [ 10 20 26 114 311]] recall score: 0.4867021276595745 precision score: 0.4867021276595745 ------------------------------ --------- SVC ------------- accuracy_training: 0.6411451398135819 accuracy_testing: 0.6582446808510638 confusion matrix: [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 0 0] [ 19 25 43 170 495]] recall score: 0.6582446808510638 precision score: 0.6582446808510638 ------------------------------ --------- NB ------------- accuracy_training: 0.6208388814913449 accuracy_testing: 0.6356382978723404 confusion matrix: [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 0 0] [ 3 6 12 44 61] [ 16 19 31 126 434]] recall score: 0.6356382978723404 precision score: 0.6356382978723404 ------------------------------ --------- DT ------------- accuracy_training: 0.6464713715046605 accuracy_testing: 0.648936170212766 confusion matrix: [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 1 2] [ 2 2 1 19 24] [ 17 23 42 150 469]] recall score: 0.648936170212766 precision score: 0.648936170212766 ------------------------------ --------- RF ------------- accuracy_training: 0.6461384820239681 accuracy_testing: 0.6476063829787234 confusion matrix: [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 1 2] [ 1 1 1 9 15] [ 18 24 42 160 478]] recall score: 0.6476063829787234 precision score: 0.6476063829787234 ------------------------------ --------- Bagging_classifier ------------- accuracy_training: 0.6448069241011984 accuracy_testing: 0.6529255319148937 confusion matrix: [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 1 2] [ 2 4 1 26 28] [ 17 21 42 143 465]] recall score: 0.6529255319148937 precision score: 0.6529255319148937 ------------------------------ --------- voting ------------- accuracy_training: 0.6451398135818908 accuracy_testing: 0.651595744680851 confusion matrix: [[ 0 0 0 0 0] [ 0 0 0 0 0] [ 0 0 0 0 0] [ 2 2 1 19 24] [ 17 23 42 151 471]] recall score: 0.651595744680851 precision score: 0.651595744680851 ------------------------------
'\nthe better result from classification is SVC.\n'
'''
make this command in order to get features that i will use after that in deployment.
'''
df.columns
Index(['Rating', 'year', 'Hotel_name_0', 'Hotel_name_1', 'Hotel_name_2',
'City_Cannes', 'City_Geneva', 'City_Monaco', 'City_Nice',
'Country_Mexico', 'Country_Switzerland'],
dtype='object')
features = ['Rating', 'year', 'Hotel_name_0', 'Hotel_name_1', 'Hotel_name_2',
'City_Cannes', 'City_Geneva', 'City_Monaco', 'City_Nice',
'Country_Mexico', 'Country_Switzerland']
'''
this to having feature and scaler that help me in deployment.
'''
joblib.dump(features,'features.h5')
joblib.dump(scaler,'scaler.h5')
['scaler.h5']
'''
when using regression by using below commands ; the result was so bad and there is accuracy was negative .
so i decide to use
classification so in this case results be fine as per above and the better logorithm is SVC.
models={
'log_reg':LogisticRegression(),
'SVC':SVC(),
'DT':DecisionTreeRegressor(),
'RF':RandomForestRegressor(n_estimators=25,n_jobs=-1),
'Bagging_classifier':BaggingRegressor(DecisionTreeRegressor(),n_estimators=5,n_jobs=-1),
'xgboost':GradientBoostingRegressor(random_state=0),
}
for name,model in models.items():
print('--------- ',name,'-------------')
model.fit(x_train,y_train)
print('R2 train score: ',model.score(x_train,y_train))
print('R2_test score: ',model.score(x_test,y_test))
print('-'*30)
'''
"\nwhen using regression by using below commands ; the result was so bad and KNN accuracy was negative and also when using \nthe regression i was need to drop day , year , month and this will make app so poor.\n\nso i decide to use\nclassification so in this case results be fine as per above and the better logorithm is SVC.\n\n\nmodels={\n 'log_reg':LogisticRegression(),\n \n 'SVC':SVC(),\n 'DT':DecisionTreeRegressor(),\n 'RF':RandomForestRegressor(n_estimators=25,n_jobs=-1),\n 'Bagging_classifier':BaggingRegressor(DecisionTreeRegressor(),n_estimators=5,n_jobs=-1),\n 'xgboost':GradientBoostingRegressor(random_state=0),\n \n}\n\nfor name,model in models.items():\n print('--------- ',name,'-------------')\n model.fit(x_train,y_train)\n \n print('R2 train score: ',model.score(x_train,y_train))\n print('R2_test score: ',model.score(x_test,y_test))\n \n print('-'*30)\n \n \n\n\n"
df
| Rating | year | Hotel_name_0 | Hotel_name_1 | Hotel_name_2 | City_Cannes | City_Geneva | City_Monaco | City_Nice | Country_Mexico | Country_Switzerland | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10 | 2023 | 0 | 0 | 1 | True | False | False | False | False | False |
| 1 | 10 | 2023 | 0 | 0 | 1 | True | False | False | False | False | False |
| 2 | 8 | 2022 | 0 | 0 | 1 | True | False | False | False | False | False |
| 3 | 10 | 2022 | 0 | 0 | 1 | True | False | False | False | False | False |
| 4 | 10 | 2021 | 0 | 0 | 1 | True | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3805 | 6 | 2021 | 1 | 1 | 0 | False | False | False | False | True | False |
| 3806 | 8 | 2021 | 1 | 1 | 0 | False | False | False | False | True | False |
| 3807 | 8 | 2021 | 1 | 1 | 0 | False | False | False | False | True | False |
| 3808 | 10 | 2021 | 1 | 1 | 0 | False | False | False | False | True | False |
| 3809 | 10 | 2021 | 1 | 1 | 0 | False | False | False | False | True | False |
3756 rows × 11 columns
'''
- high correlation filter result ==> there is better correlation between year and Rating while
has bad correlation between day and month so i will drop them.
- HyperParmeter Tunning Result ==> use GridsearchCV as it has more accuracy ; and the result after using it is that
the best alogorithm is SVC.
- aftr applying several algorithm the better model is SVC as it has more accuracy_training and accuracy_testing.
'''